Scratch work

Code
# IMPORT RELEVANT LIBRARIES
import numpy as np
import pandas as pd
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import numpy as np
import json
import requests

from pathlib import Path
Code
import altair as alt


# Function to download geojson file
response1 = requests.get('https://raw.githubusercontent.com/arcee123/GIS_GEOJSON_CENSUS_TRACTS/master/11.geojson')
# Create background of D.C.
background = alt.Chart(alt.Data(values=response1.json())).mark_geoshape(
        fill="lightgray",
        stroke='white',
        strokeWidth=1
    ).encode(
    ).properties(
        width=600,
        height=600
    )
background
Code
# import the csv
dc_health_df = pd.read_csv('./PLACES__Census_Tract_Data__GIS_Friendly_Format___2022_release (1).csv')

# filter for where StateAbbr = DC
dc_health_df = dc_health_df[dc_health_df['StateAbbr'] == 'DC']
dc_health_df.head()
StateAbbr StateDesc CountyName CountyFIPS TractFIPS TotalPopulation ACCESS2_CrudePrev ACCESS2_Crude95CI ARTHRITIS_CrudePrev ARTHRITIS_Crude95CI ... OBESITY_Crude95CI PHLTH_CrudePrev PHLTH_Crude95CI SLEEP_CrudePrev SLEEP_Crude95CI STROKE_CrudePrev STROKE_Crude95CI TEETHLOST_CrudePrev TEETHLOST_Crude95CI Geolocation
8871 DC District of Columbia District of Columbia 11001 11001007604 3644 9.0 ( 7.0, 11.3) 24.3 (22.9, 25.7) ... (30.2, 34.0) 10.4 ( 9.2, 11.7) 39.0 (37.0, 40.8) 4.8 ( 4.2, 5.5) 15.5 ( 9.8, 22.1) POINT (-76.96379997 38.86610065)
8884 DC District of Columbia District of Columbia 11001 11001005600 6756 5.5 ( 4.4, 7.0) 13.7 (13.0, 14.4) ... (15.6, 18.2) 5.9 ( 5.3, 6.7) 26.2 (24.3, 28.2) 2.0 ( 1.8, 2.2) 7.6 ( 5.1, 10.9) POINT (-77.05385751 38.90021244)
8919 DC District of Columbia District of Columbia 11001 11001008402 2149 5.2 ( 4.1, 6.6) 15.9 (15.2, 16.8) ... (19.8, 22.0) 5.7 ( 5.1, 6.4) 28.8 (27.5, 30.3) 2.4 ( 2.1, 2.8) 7.2 ( 4.1, 14.4) POINT (-76.9910702 38.89861357)
9058 DC District of Columbia District of Columbia 11001 11001001100 4779 3.3 ( 3.0, 3.8) 18.8 (18.1, 19.6) ... (16.0, 17.5) 5.2 ( 4.9, 5.7) 23.6 (22.8, 24.4) 2.0 ( 1.9, 2.2) 3.4 ( 2.6, 5.2) POINT (-77.07767045 38.95722636)
9075 DC District of Columbia District of Columbia 11001 11001008002 3031 4.7 ( 3.8, 6.1) 13.4 (12.7, 14.2) ... (19.4, 21.8) 4.9 ( 4.4, 5.6) 28.9 (27.6, 30.5) 1.8 ( 1.6, 2.0) 5.6 ( 3.3, 9.7) POINT (-76.98275018 38.89154628)

5 rows × 67 columns

Code
import plotly.express as px
import plotly.graph_objects as go

# Ingest GEOJSON file of census tracts in DC
req_dc = requests.get('https://raw.githubusercontent.com/arcee123/GIS_GEOJSON_CENSUS_TRACTS/master/11.geojson')

json_dc = req_dc.json()

hover_cols = ['ACCESS2_CrudePrev', 'ARTHRITIS_CrudePrev']

px.choropleth(dc_health_df,
              geojson=json_dc,
              locations='TractFIPS',

              color='TotalPopulation',
              featureidkey="properties.GEOID",
              scope="usa",
              center={'lat':38.8938005,'lon':-77.1579293},
              hover_data = hover_cols,
              fitbounds="locations",
              height=700,
              title = "DC Population")
Code
# IMPORT RELEVANT LIBRARIES
import numpy as np
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
import numpy as np
import json
import requests
import numpy as np
import scipy.stats
import plotly.subplots as sp
from pathlib import Path
Code
# import the csv
dc_health_df = pd.read_csv('./PLACES__Census_Tract_Data__GIS_Friendly_Format___2022_release (1).csv')

# filter for where StateAbbr = DC
dc_health_df = dc_health_df[dc_health_df['StateAbbr'] == 'DC']
Code
# Resetting defaults and import plotly libraries
import plotly.io as pio
pio.renderers.default = "browser"
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import r2_score


# isolate only columns with CrudePrev in the name
dc_health_df_prev = dc_health_df.filter(regex='CrudePrev')
df = dc_health_df_prev

# Rename columns
df = df.rename(columns={'ACCESS2_CrudePrev': '% of Adults without Health Insurance', 
                        'ARTHRITIS_CrudePrev': '% of Adults with Arthritis', 
                        'BINGE_CrudePrev': '% of Adults who Binge Drink',
                        'BPHIGH_CrudePrev': '% of Adults with High Blood Pressure',
                        'BPMED_CrudePrev': '% of Adults with High Blood Pressure who take Blood Pressure Medication',
                        'CANCER_CrudePrev': '% of Adults who were Diagnosed with Cancer',
                        'CASTHMA_CrudePrev': '% of Adults who were Diagnosed with Asthma',
                        'CERVICAL_CrudePrev': '% of Women who had a Pap Smear in the Past 3 Years',
                        'CHD_CrudePrev': '% of Adults who were Diagnosed with Coronary Heart Disease',
                        'CHECKUP_CrudePrev': '% of Adults who had a Routine Checkup in the Past Year',
                        'CHOLSCREEN_CrudePrev': '% of Adults who had Cholesterol Checked in the Past 5 Years',
                        'COLON_SCREEN_CrudePrev': '% of Adults who had a Colonoscopy or similar test in the Past 10 Years',
                        'COPD_CrudePrev': '% of Adults who were Diagnosed with COPD (Chronic Obstructive Pulmonary Disease)',
                        'COREM_CrudePrev': '% Prevalence of Older Adult Men aged >=65 years who are up to date on preventative health',
                        'COREW_CrudePrev': '% Prevalence of Older Adult Women aged >=65 years who are up to date on preventative health',
                        'CSMOKING_CrudePrev': '% of Adults who Currently Smoke',
                        'DENTAL_CrudePrev': '% of Adults who had a Dental Visit in the Past Year',
                        'DEPRESSION_CrudePrev': '% of Adults who were Diagnosed with Depression',
                        'DIABETES_CrudePrev': '% of Adults who were Diagnosed with Diabetes',
                        'GHLTH_CrudePrev': '% of Adults who reported their Health as not Good',
                        'HIGHCHOL_CrudePrev': '% of Adults who were Diagnosed with High Cholesterol',
                        'KIDNEY_CrudePrev': '% of Adults who were Diagnosed with Kidney Disease',
                        'LPA_CrudePrev': '% of Adults who are Physically Inactive', 
                        'MAMMOUSE_CrudePrev': '% Women aged 50-74 years who had a Mammogram in the Past 2 Years',
                        'MHLTH_CrudePrev': '% of Adults who reported their Mental Health as not Good',
                        'OBESITY_CrudePrev': '% of Adults who were Obese',
                        'PHLTH_CrudePrev': '% of Adults who reported their Physical Health as not Good',
                        'SLEEP_CrudePrev': '% of Adults who reported their Sleep as not Good',
                        'STROKE_CrudePrev': '% of Adults who were Diagnosed with Stroke',
                        'TEETHLOST_CrudePrev': '% of Adults who have lost all of their Natural Teeth'})

# list of health metrics for drop down menu
column_names = df.columns

# Creating the initial scatter plot
fig = go.Figure(go.Scatter(x=df[column_names[0]], y=df[column_names[1]], mode='markers'))

# Label axes
fig.update_xaxes(title_text='X Axis')
fig.update_yaxes(title_text='Y Axis')

# Setting the range for x and y axes
fig.update_xaxes(range=[0, 100])
fig.update_yaxes(range=[0, 100])

for col in column_names:
    for col2 in column_names:
        x = df[col]
        y = df[col2]
        fig.add_trace(go.Scatter(x=x, y=y, mode='markers', name=col + ' vs ' + col2, showlegend=False, visible=False))


# Update the visibility of the traces
        

def update_visibility(selected_col, selected_col2):
    for i, trace in enumerate(fig.data):
        if trace.name == selected_col + ' vs ' + selected_col2:
            trace.visible = True
        elif trace.name == selected_col + ' vs ' + selected_col2 + ' Best Fit':
            trace.visible = True
        else:
            trace.visible = False

# Create the drop-down menus for x (col) and y (col2) axes of the scatter plot
col_dropdown = [{'label': col, 'value': col} for col in column_names]
col2_dropdown = [{'label': col2, 'value': col2} for col2 in column_names]

# #Define the dropdown menu for x-axis
button_layer_1_height = 1.08
x_axis_dropdown = go.layout.Updatemenu(
    buttons=list([dict(args=[{'x': [df[col]]}, update_visibility(col, col2)], label=col, method='update') for col in column_names]),
    direction="down",
    pad={"r": 10, "t": 10},
    showactive=True,
    x=0.06,
    xanchor="left",
    y=button_layer_1_height + 0.05,
    yanchor="top"
)



# Define the dropdown menu for y-axis
y_axis_dropdown = go.layout.Updatemenu(
    buttons=list([dict(args=[{'y': [df[col2]]}, update_visibility(col, col2)], label=col2, method='update') for col2 in column_names]),
    direction="down",
    pad={"r": 10, "t": 10},
    showactive=True,
    x=0.06,
    xanchor="left",
    y=button_layer_1_height,
    yanchor="top"
)




# Update the layout to include the dropdown menus
fig.update_layout(
    updatemenus=[x_axis_dropdown, y_axis_dropdown]
)

# Label axes
fig.update_xaxes(title_text='X Axis')
fig.update_yaxes(title_text='Y Axis')

# Setting the range for x and y axes
fig.update_xaxes(range=[0, 100])
fig.update_yaxes(range=[0, 100])

# Update plot sizing
fig.update_layout(
    width=900,
    height=900,
    autosize=False,
    #margin=dict(t=100, b=0, l=0, r=0),
)

# add annotations
fig.update_layout(
    annotations=[
        dict(
            text="X Axis:",
            x=0,
            xref="paper",
            y=button_layer_1_height + 0.025,
            yref="paper",
            align="left",
            showarrow=False
        ),
        dict(
            text="Y Axis:",
            x=0,
            xref="paper",
            y=button_layer_1_height - 0.025,
            yref="paper",
            align="left",
            showarrow=False
        )
    ]
)


# Change background color to grey
fig.update_layout(
    plot_bgcolor='rgb(230, 230, 230)'
)

# Change scatter point color to red
fig.update_traces(
    marker=dict(color='red')
)

# Change font to Calibri
fig.update_layout(
    font=dict(family='Proxima Nova')
)

# # # Create a function to update the visibility of the traces based on selected columns
# def update_visibility(selected_col, selected_col2):
#     for i, trace in enumerate(fig.data):
#         trace.visible = (trace.name == selected_col + ' vs ' + selected_col2)
#         trace.visible = (trace.name == selected_col + ' vs ' + selected_col2 + ' Best Fit')




# Display the scatter plot with dropdown menus
fig.show()
Code
import numpy as np

x = df['% of Adults who Binge Drink']
y = df['% of Adults who reported their Physical Health as not Good']

# Find the coefficients of the linear regression line (i.e., the line of best fit)
coefficients = np.polyfit(x, y, 1)

# Create the line of best fit using the coefficients
line = np.poly1d(coefficients)

# Calculate the R-squared value
r2 = np.corrcoef(x, y)[0, 1]**2

print(line)
print(r2)
 
-0.6238 x + 20.94
0.7112826515828994
Code
# Import walkability data

df_walk = pd.read_csv('./joined_depression_cre_walkability.csv')
Code
dc_health_df.rename(columns={'TractFIPS': 'census_tract'}, inplace=True)
df_walk.rename(columns={'geoid_tract_20': 'census_tract'}, inplace=True)
Code
# Merge the two dataframes
df_merged = pd.merge(dc_health_df, df_walk, on='census_tract', how='left')
Code
df_merged.head()
StateAbbr StateDesc CountyName CountyFIPS census_tract TotalPopulation ACCESS2_CrudePrev ACCESS2_Crude95CI ARTHRITIS_CrudePrev ARTHRITIS_Crude95CI ... no_health_insurance_perc male_perc female_perc income_inequality_gini_index hs_grad_perc households_no_vehicle_perc households_w_internet_perc walkability_score GEOID neighborhood_name
0 DC District of Columbia District of Columbia 11001 11001007604 3644 9.0 ( 7.0, 11.3) 24.3 (22.9, 25.7) ... 3.4 44.1 55.9 0.5617 88.7 33.7 76.6 11.666667 1.100101e+10 NAYLOR/HILLCREST
1 DC District of Columbia District of Columbia 11001 11001005600 6756 5.5 ( 4.4, 7.0) 13.7 (13.0, 14.4) ... 2.2 50.6 49.4 0.5699 99.0 60.6 91.4 15.916667 1.100101e+10 GWU
2 DC District of Columbia District of Columbia 11001 11001008402 2149 5.2 ( 4.1, 6.6) 15.9 (15.2, 16.8) ... 3.6 43.2 56.8 0.4920 96.6 38.6 86.9 15.500000 1.100101e+10 UNION STATION
3 DC District of Columbia District of Columbia 11001 11001001100 4779 3.3 ( 3.0, 3.8) 18.8 (18.1, 19.6) ... 2.3 44.0 56.0 0.5019 99.3 24.1 88.2 17.750000 1.100100e+10 TENLEYTOWN
4 DC District of Columbia District of Columbia 11001 11001008002 3031 4.7 ( 3.8, 6.1) 13.4 (12.7, 14.2) ... 0.3 46.8 53.2 0.4067 95.9 18.7 85.2 15.583333 1.100101e+10 KINGMAN PARK

5 rows × 84 columns

Code
# Resetting defaults and import plotly libraries
import plotly.io as pio
pio.renderers.default = "browser"
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

# isolate only columns with CrudePrev in the name
dc_health_df_prev = df_merged.filter(regex='CrudePrev')
# add the walkability_score column back in
dc_health_df_prev['walkability_score'] = df_merged['walkability_score']
df = dc_health_df_prev

# Rename columns
df = df.rename(columns={'ACCESS2_CrudePrev': '% of Adults without Health Insurance', 
                        'ARTHRITIS_CrudePrev': '% of Adults with Arthritis', 
                        'BINGE_CrudePrev': '% of Adults who Binge Drink',
                        'BPHIGH_CrudePrev': '% of Adults with High Blood Pressure',
                        'BPMED_CrudePrev': '% of Adults with High Blood Pressure who take Blood Pressure Medication',
                        'CANCER_CrudePrev': '% of Adults who were Diagnosed with Cancer',
                        'CASTHMA_CrudePrev': '% of Adults who were Diagnosed with Asthma',
                        'CERVICAL_CrudePrev': '% of Women who had a Pap Smear in the Past 3 Years',
                        'CHD_CrudePrev': '% of Adults who were Diagnosed with Coronary Heart Disease',
                        'CHECKUP_CrudePrev': '% of Adults who had a Routine Checkup in the Past Year',
                        'CHOLSCREEN_CrudePrev': '% of Adults who had Cholesterol Checked in the Past 5 Years',
                        'COLON_SCREEN_CrudePrev': '% of Adults who had a Colonoscopy or similar test in the Past 10 Years',
                        'COPD_CrudePrev': '% of Adults who were Diagnosed with COPD (Chronic Obstructive Pulmonary Disease)',
                        'COREM_CrudePrev': '% Prevalence of Older Adult Men aged >=65 years who are up to date on preventative health',
                        'COREW_CrudePrev': '% Prevalence of Older Adult Women aged >=65 years who are up to date on preventative health',
                        'CSMOKING_CrudePrev': '% of Adults who Currently Smoke',
                        'DENTAL_CrudePrev': '% of Adults who had a Dental Visit in the Past Year',
                        'DEPRESSION_CrudePrev': '% of Adults who were Diagnosed with Depression',
                        'DIABETES_CrudePrev': '% of Adults who were Diagnosed with Diabetes',
                        'GHLTH_CrudePrev': '% of Adults who reported their Health as not Good',
                        'HIGHCHOL_CrudePrev': '% of Adults who were Diagnosed with High Cholesterol',
                        'KIDNEY_CrudePrev': '% of Adults who were Diagnosed with Kidney Disease',
                        'LPA_CrudePrev': '% of Adults who are Physically Inactive', 
                        'MAMMOUSE_CrudePrev': '% Women aged 50-74 years who had a Mammogram in the Past 2 Years',
                        'MHLTH_CrudePrev': '% of Adults who reported their Mental Health as not Good',
                        'OBESITY_CrudePrev': '% of Adults who were Obese',
                        'PHLTH_CrudePrev': '% of Adults who reported their Physical Health as not Good',
                        'SLEEP_CrudePrev': '% of Adults who reported their Sleep as not Good',
                        'STROKE_CrudePrev': '% of Adults who were Diagnosed with Stroke',
                        'TEETHLOST_CrudePrev': '% of Adults who have lost all of their Natural Teeth'})

# list of health metrics for drop down menu
column_names = df.columns

# Creating the initial scatter plot
fig = go.Figure(go.Scatter(x=df[column_names[0]], y=df[column_names[1]], mode='markers'))

# Label axes
fig.update_xaxes(title_text='Walkability Score')
fig.update_yaxes(title_text='Y Axis')

# Setting the range for x and y axes
#fig.update_xaxes(range=[0, 100])
fig.update_xaxes(range=[0, max(df['walkability_score'])])
fig.update_yaxes(range=[0, 100])

for col in column_names:
    fig.add_trace(go.Scatter(x=df['walkability_score'], y=df[col], mode='markers', name='Walkability vs ' + col, visible=False))


def update_visibility(selected_col, selected_col2):
    return [(trace.name == selected_col + ' vs ' + selected_col2) for trace in fig.data]


# Create the drop-down menus for x (col) and y (col2) axes of the scatter plot
col_dropdown = [{'label': col, 'value': col} for col in column_names]

# Define the dropdown menu for x-axis
button_layer_1_height = 1.08

y_axis_dropdown = go.layout.Updatemenu(
    buttons=list([
        dict(
            args=[
                {"y": [df[col]], "visible": [(trace.name == "Walkability vs " + col) for trace in fig.data]}
            ],
            label=col,
            method="update",
        ) for col in column_names
    ]),
    direction="down",
    pad={"r": 10, "t": 10},
    showactive=True,
    x=0.06,
    xanchor="left",
    y=button_layer_1_height,
    yanchor="top"
)




# Update the layout to include the dropdown menus
fig.update_layout(
    updatemenus=[y_axis_dropdown]
)

# Label axes
fig.update_xaxes(title_text='X Axis')
fig.update_yaxes(title_text='Y Axis')



# Update plot sizing
fig.update_layout(
    width=900,
    height=900,
    autosize=False,
    #margin=dict(t=100, b=0, l=0, r=0),
)

# add annotations
fig.update_layout(
    annotations=[
        dict(
            text="X Axis: Walkability Score of the Neighborhood",
            x=0,
            xref="paper",
            y=button_layer_1_height + 0.025,
            yref="paper",
            align="left",
            showarrow=False
        ),
        dict(
            text="Y Axis:",
            x=0,
            xref="paper",
            y=button_layer_1_height - 0.025,
            yref="paper",
            align="left",
            showarrow=False
        )
    ]
)


def update_visibility(selected_col, selected_col2):
    return [(trace.name == selected_col + ' vs ' + selected_col2) for trace in fig.data]


# Change background color to grey
fig.update_layout(
    plot_bgcolor='rgb(230, 230, 230)'
)

# Change scatter point color to red
fig.update_traces(
    marker=dict(color='red')
)

# Change font to Calibri
fig.update_layout(
    font=dict(family='Proxima Nova')
)


# Display the scatter plot with dropdown menus
fig.show()
/var/folders/xw/lth4lxb57psc5_ks7_nq1zj40000gn/T/ipykernel_41309/2832497246.py:12: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy